"""Extract the distinct query terms from a query file into a sorted term list.

Every non-blank input line is expected to look like ``<queryID>:<query text>``.
The query text is lower-cased, every character that is not an ASCII letter,
digit or space is replaced by a space, and the result is split into terms.
The distinct terms over all queries are written, sorted and one per line, to
the output file.

Original author's note: currently, just deal with 337 query terms
(the count is not checked by this script).
"""

inputQueryFileName = "/home/diaosi/gov2ClearYourMindAndDoItAgain/100KQueries_tail_100"
outputFileName = "/home/diaosi/gov2ClearYourMindAndDoItAgain/queryTermsFor100KQueriesTail100"

# Characters that survive sanitising. Upper-case letters need no entry because
# the text is lower-cased before it is filtered.
ALLOWED_QUERY_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789 ")

# A set gives O(1) duplicate detection; it is turned into the sorted list
# `queryTermList` once all queries have been read.
uniqueQueryTerms = set()

# `with` guarantees the input handle is closed even if a line fails to parse.
with open(inputQueryFileName, "r") as inputQueryHandler:
    for line in inputQueryHandler:
        line = line.strip()
        if not line:
            # Blank lines carry no query; skip them instead of failing on int("").
            continue

        elements = line.split(":")
        queryID = int(elements[0])

        # NOTE(review): only the field between the first and second colon is
        # used, so query text after a second ':' is ignored -- confirm that
        # this is intended for the input format.
        data = elements[1].lower()

        # Replace every disallowed character with a space in a single pass.
        queryContent = "".join(
            ch if ch in ALLOWED_QUERY_CHARS else " " for ch in data
        )

        # Single pre-formatted string so the output is the same whether this
        # runs as a Python 2 print statement or a Python 3 print call.
        print("processing qid: %s queryContent %s" % (queryID, queryContent))

        # After sanitising, spaces are the only whitespace left, so a plain
        # split() yields exactly the non-empty space-separated terms.
        uniqueQueryTerms.update(queryContent.split())

queryTermList = sorted(uniqueQueryTerms)

# `with` guarantees the output is flushed and closed even if a write fails.
with open(outputFileName, "w") as outputFileHandler:
    for queryTerm in queryTermList:
        outputFileHandler.write(queryTerm + "\n")